{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "import lightgbm as lgb\n",
    "from functools import reduce\n",
    "import datetime\n",
    "import os\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def ftrans(num):\n",
    "    num = int(num)\n",
    "    if num == 0:\n",
    "        return 0\n",
    "    if num == 1000:\n",
    "        return 1\n",
    "    if num == 1500:\n",
    "        return 2\n",
    "    if num == 2000:\n",
    "        return 3\n",
    "\n",
    "def trans(num):\n",
    "    num = int(num)\n",
    "    if num == 0:\n",
    "        return 0\n",
    "    if num == 1:\n",
    "        return 1000\n",
    "    if num == 2:\n",
    "        return 1500\n",
    "    if num == 3:\n",
    "        return 2000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(21605, 33) (21631, 32) (21631, 29) (21631, 13) (21631, 11) (21631, 13) (21631, 31) (21631, 13)\n"
     ]
    }
   ],
   "source": [
    "where = pd.read_csv(\"card_where_amount_top.csv\",encoding=\"GB18030\")#33\n",
    "consume = pd.read_csv(\"card_consume_amount_top.csv\",encoding=\"GB18030\")#29\n",
    "time = pd.read_csv(\"card_time_amount_top.csv\",encoding=\"GB18030\")#32\n",
    "time_DayofMonth = pd.read_csv(\"card_time_DayofMonth_amount_top.csv\",encoding=\"GB18030\")#13\n",
    "time_DayofWeek = pd.read_csv(\"card_time_DayofWeek_amount_top.csv\",encoding=\"GB18030\")#11\n",
    "time_HourofDay = pd.read_csv(\"card_time_HourofDay_amount_top.csv\",encoding=\"GB18030\")#13\n",
    "time_MonthofYeah = pd.read_csv(\"card_time_MonthofYeah_amount_top.csv\",encoding=\"GB18030\")#31\n",
    "time_DayofYeah = pd.read_csv(\"card_time_DayofYeah_amount_top.csv\",encoding=\"GB18030\")#13\n",
    "print where.shape,time.shape,consume.shape,time_DayofMonth.shape,time_DayofWeek.shape,time_HourofDay.shape,time_MonthofYeah.shape,time_DayofYeah.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def topk(k1,k2,k3,k4,k5,k6,k7,k8):\n",
    "    where = pd.read_csv(\"card_where_amount_top.csv\",encoding=\"GB18030\").iloc[:,:k1]\n",
    "    consume = pd.read_csv(\"card_consume_amount_top.csv\",encoding=\"GB18030\").iloc[:,:k2]\n",
    "    time = pd.read_csv(\"card_time_amount_top.csv\",encoding=\"GB18030\").iloc[:,:k3]\n",
    "    time_DayofMonth = pd.read_csv(\"card_time_DayofMonth_amount_top.csv\",encoding=\"GB18030\").iloc[:,:k4]\n",
    "    time_DayofWeek = pd.read_csv(\"card_time_DayofWeek_amount_top.csv\",encoding=\"GB18030\").iloc[:,:k5]\n",
    "    time_HourofDay = pd.read_csv(\"card_time_HourofDay_amount_top.csv\",encoding=\"GB18030\").iloc[:,:k6]\n",
    "    time_MonthofYeah = pd.read_csv(\"card_time_MonthofYeah_amount_top.csv\",encoding=\"GB18030\").iloc[:,:k7]\n",
    "    time_DayofYeah = pd.read_csv(\"card_time_DayofYeah_amount_top.csv\",encoding=\"GB18030\").iloc[:,:k8]\n",
    "    return where,time,consume,time_DayofMonth,time_DayofWeek,time_HourofDay,time_MonthofYeah,time_DayofYeah"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "score_train = pd.read_table('data/train/score_train.txt',encoding=\"GB18030\",sep=',',header=-1)\n",
    "score_train.columns = ['id','college','score']\n",
    "score_test = pd.read_table('data/test/score_final_test.txt',encoding=\"GB18030\",sep=',',header=-1)\n",
    "score_test.columns = ['id','college','score']\n",
    "score_train_test = pd.concat([score_train,score_test])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "college_train_test = score_train_test.groupby([\"id\",\"college\"])[\"score\"].max().unstack([\"college\"]).fillna(0)\n",
    "college_name_list = list(college_train_test.columns)\n",
    "college_train_test = college_train_test.rename(columns=dict(zip(college_name_list,[\"college_\"+str(i) for i in college_name_list]))).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "college = pd.DataFrame(score_train_test.groupby(['college'])['score'].max()).reset_index()\n",
    "college.columns = ['college','college_num']\n",
    "score_train_test = pd.merge(score_train_test, college, how='left',on='college')\n",
    "score_train_test['college_orderate'] = score_train_test['score']/score_train_test['college_num']\n",
    "del score_train_test[\"college\"]\n",
    "#del score_train_test[\"score\"]\n",
    "score_train_test = pd.merge(score_train_test,college_train_test,how=\"left\",on=\"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "card_train = pd.read_table('data/train/card_train.txt',sep=',',header=-1)\n",
    "card_train.columns = ['id','consume','where','how','time','amount','remainder']\n",
    "card_test = pd.read_table('data/test/card_final_test.txt',sep=',',header=-1)\n",
    "card_test.columns = ['id','consume','where','how','time','amount','remainder']\n",
    "card_train_test = pd.concat([card_train,card_test])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "card = pd.DataFrame(card_train_test.groupby(['id'])[\"time\"].count()).reset_index().rename(columns={\"time\":\"cardActionNum\"})\n",
    "card['remainderavg'] = card_train_test.groupby(['id'])['remainder'].mean()\n",
    "card['remaindermax'] = card_train_test.groupby(['id'])['remainder'].max()\n",
    "card['remaindermin'] = card_train_test.groupby(['id'])['remainder'].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def singleFeatureDeal(str1,feature):\n",
    "    x_count = card_train_test.groupby([\"id\",str1])[feature].count().unstack([str1])\n",
    "    x_count = x_count.rename(columns=dict(zip(x_count,[str(i)+\"_count\" for i in x_count])))\n",
    "\n",
    "    x_sum = card_train_test.groupby([\"id\",str1])[feature].sum().unstack([str1])\n",
    "    x_sum = x_sum.rename(columns=dict(zip(x_sum,[str(i)+\"_sum\" for i in x_sum])))\n",
    "\n",
    "    x_mean = card_train_test.groupby([\"id\",str1])[feature].mean().unstack([str1])\n",
    "    x_mean = x_mean.rename(columns=dict(zip(x_mean,[str(i)+\"_mean\" for i in x_mean])))\n",
    "\n",
    "    x_max = card_train_test.groupby([\"id\",str1])[feature].max().unstack([str1])\n",
    "    x_max = x_max.rename(columns=dict(zip(x_max,[str(i)+\"_max\" for i in x_max])))\n",
    "\n",
    "    x_min = card_train_test.groupby([\"id\",str1])[feature].min().unstack([str1])\n",
    "    x_min = x_min.rename(columns=dict(zip(x_min,[str(i)+\"_min\" for i in x_min])))\n",
    "\n",
    "    x_median = card_train_test.groupby([\"id\",str1])[feature].median().unstack([str1])\n",
    "    x_median = x_median.rename(columns=dict(zip(x_median,[str(i)+\"_median\" for i in x_median])))\n",
    "    \n",
    "    x = pd.concat([x_count,x_sum,x_mean,x_max,x_min,x_median],axis=1).reset_index().fillna(0)\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "card_how_amount = singleFeatureDeal(\"how\",\"amount\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def Pipeline(seed,num,k1,k2,k3,k4,k5,k6,k7,k8,dropList1,dropList2):\n",
    "      \n",
    "    # train_test\n",
    "    train = pd.read_table('data/train/subsidy_train.txt',encoding=\"GB18030\",sep=',',header=-1)\n",
    "    train.columns = ['id','money']\n",
    "    test = pd.read_table('data/test/subsidy_final_test.txt',encoding=\"GB18030\",sep=',',header=-1)\n",
    "    test.columns = ['id']\n",
    "    test['money'] = np.nan\n",
    "    train_test = pd.concat([train,test])\n",
    "    \n",
    "    where,time,consume,time_DayofMonth,time_DayofWeek,time_HourofDay,time_MonthofYeah,time_DayofYeah = topk(k1,k2,k3,k4,k5,k6,k7,k8)\n",
    "    print where.shape,time.shape,consume.shape,time_DayofMonth.shape,time_DayofWeek.shape,time_HourofDay.shape,time_MonthofYeah.shape,time_DayofYeah.shape\n",
    "    train_test = pd.merge(train_test,score_train_test,how='left',on='id')\n",
    "    train_test = pd.merge(train_test,card,how='left',on='id')\n",
    "    train_test = pd.merge(train_test,card_how_amount,how='left',on='id')\n",
    "    \n",
    "    train_test = pd.merge(train_test,where,how='left',on='id')\n",
    "    train_test = pd.merge(train_test,consume,how='left',on='id')\n",
    "    train_test = pd.merge(train_test,time,how='left',on='id')\n",
    "    train_test = pd.merge(train_test,time_DayofMonth,how='left',on='id')\n",
    "    train_test = pd.merge(train_test,time_DayofWeek,how='left',on='id')\n",
    "    train_test = pd.merge(train_test,time_HourofDay,how='left',on='id')\n",
    "    train_test = pd.merge(train_test,time_MonthofYeah,how='left',on='id')\n",
    "    train_test = pd.merge(train_test,time_DayofYeah,how='left',on='id')\n",
    "    \n",
    "    \n",
    "    train_test.drop(dropList1,axis=1, inplace=True)\n",
    "    train_test.drop(dropList2,axis=1, inplace=True)   \n",
    "    \n",
    "    train = train_test.loc[train_test.money.notnull(),:]\n",
    "    test = train_test.loc[train_test.money.isnull(),:]\n",
    "    \n",
    "    #### Oversample\n",
    "    del test[\"money\"]\n",
    "    Oversampling1000 = train.loc[train.money == 1000]\n",
    "    Oversampling1500 = train.loc[train.money == 1500]\n",
    "    Oversampling2000 = train.loc[train.money == 2000]\n",
    "    for i in range(8):\n",
    "        train = train.append(Oversampling1000)\n",
    "    for j in range(11):\n",
    "        train = train.append(Oversampling1500)\n",
    "    for k in range(12):\n",
    "        train = train.append(Oversampling2000)\n",
    "\n",
    "    # model\n",
    "    train = train.set_index([\"id\"]).fillna(-1)\n",
    "    test = test.set_index([\"id\"]).fillna(-1)\n",
    "    train['money'] = train['money'].apply(lambda x: ftrans(x))\n",
    "    \n",
    "    \n",
    "    np.random.seed(seed)\n",
    "    X_train = train.iloc[np.random.permutation(len(train))]\n",
    "    y_train = X_train.money\n",
    "    X_train = X_train.drop(['money'],axis=1)\n",
    "    \n",
    "    xgb_train   = xgb.DMatrix(X_train,y_train)\n",
    "    xgb_test = xgb.DMatrix(test)\n",
    "    \n",
    "    xgb_params = {\n",
    "        'booster': 'gbtree',\n",
    "        #'objective': 'multi:softmax',\n",
    "        'objective': 'multi:softprob',\n",
    "        'num_class': 4,\n",
    "        'max_depth': 5,\n",
    "        'learning_rate': 0.04,\n",
    "        'colsample_bytree': 0.8,\n",
    "        'subsample': 0.8,\n",
    "        'eval_metric': 'merror',\n",
    "        'silent': 1,\n",
    "    }\n",
    "    \n",
    "    watchlist = [(xgb_train,'train')]\n",
    "    bst = xgb.train(xgb_params,xgb_train,100,watchlist,verbose_eval=False)\n",
    "    yprob = bst.predict(xgb_test).reshape( test.shape[0], 4 )\n",
    "    ylabel = np.argmax(yprob, axis=1)\n",
    "    pd.DataFrame(yprob).to_csv(\"xgb_prob_model1_seed{}_{}.csv\".format(seed,num))\n",
    "    \n",
    "    pred = pd.DataFrame({'studentid': test.reset_index()['id'],\n",
    "                             'subsidy': ylabel})\n",
    "    pred['subsidy'] = pred['subsidy'].apply(lambda x: trans(x))\n",
    "    pred.to_csv(\"./xgb_pred_seed{}_{}.csv\".format(seed,num), index=False)\n",
    "    #print \"数据量:\", train.shape[0], \":\", pred.shape[0]\n",
    "    #print \"有奖数量:\", sum(train['money'] != 0), \":\", sum(pred['subsidy'] != 0)\n",
    "    print \"1000奖学金数量：\", sum(train['money'] == 1000), sum(pred['subsidy'] == 1000)\n",
    "    print \"1500奖学金数量：\", sum(train['money'] == 1500), sum(pred['subsidy'] == 1500)\n",
    "    print \"2000奖学金数量：\", sum(train['money'] == 2000), sum(pred['subsidy'] == 2000)\n",
    " \n",
    "    pd.Series(bst.get_fscore()).reset_index().sort_values(0).to_csv(\"xgb_feature_seed{}_{}.csv\".format(seed,num),encoding=\"GB18030\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(21778, 15) (21806, 15) (21806, 15) (21806, 13) (21806, 11) (21806, 13) (21806, 15) (21806, 13)\n",
      "1000奖学金数量： 0 2326\n",
      "1500奖学金数量： 0 696\n",
      "2000奖学金数量： 0 288\n"
     ]
    }
   ],
   "source": [
    "\n",
    "dropList1 = [\"校医院_count\",\"其他_sum\",\"其他_min\",\"校医院_min\",\"校医院_max\",\"校医院_median\",\"文印中心_median\",\"洗衣房_min\",\n",
    "           \"开水_min\",\"college_13\",\"教务处_max\",\"其他_sum\",\"其他_median\",\"其他_max\",\"college_1\",\"college_12\",\"college_18\"]\n",
    "#dropList1 =[]\n",
    "dropList2 = []\n",
    "Pipeline(100,1,15,15,15,15,15,15,15,15,dropList1,dropList2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(21778, 15) (21806, 15) (21806, 15) (21806, 13) (21806, 11) (21806, 13) (21806, 15) (21806, 13)\n",
      "1000奖学金数量： 0 2355\n",
      "1500奖学金数量： 0 676\n",
      "2000奖学金数量： 0 275\n"
     ]
    }
   ],
   "source": [
    "\n",
    "dropList1 = [\"校医院_count\",\"其他_sum\",\"其他_min\",\"校医院_min\",\"校医院_max\",\"校医院_median\",\"文印中心_median\",\"洗衣房_min\",\n",
    "           \"开水_min\",\"college_13\",\"教务处_max\",\"其他_sum\",\"其他_median\",\"其他_max\",\"college_1\",\"college_12\",\"college_18\"]\n",
    "#dropList1 =[]\n",
    "dropList2 = []\n",
    "Pipeline(4000,2,15,15,15,15,15,15,15,15,dropList1,dropList2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
